#encoding:utf8
'''
Content-parsing class for crawled web pages.
Extracts the specified content from a page according to an Item's rule.
'''
import bs4
class Item(object):
    """Extraction rule that pulls selected content out of an HTML page.

    An Item pairs a CSS selector with an optional attribute name. Calling
    getContent() parses a page and returns either the matched elements or
    the value of that attribute on each matched element.
    """

    def __init__(self, ruleType, rule, attr=None):
        """Create an extraction rule.

        Args:
            ruleType: Caller-supplied rule kind. It is stored on the
                instance but not interpreted by this class; ``rule`` is
                always handed to ``soup.select``.
            rule: CSS selector string passed to ``soup.select``.
            attr: Name of the attribute to read from each matched element,
                or None to return the matched elements themselves.
        """
        self.ruleType = ruleType
        self.rule = rule
        self.attr = attr

    def getContent(self, page, parser='html5lib', from_encoding='utf8'):
        """Parse ``page`` and return everything this rule matches.

        Args:
            page: Markup to parse, as accepted by ``bs4.BeautifulSoup``.
            parser: Name of the BeautifulSoup parser to use.
            from_encoding: Encoding hint for markup that is not already
                decoded text.

        Returns:
            A list with one entry per element matched by ``self.rule``:
            the element itself when ``self.attr`` is None, otherwise the
            value of ``self.attr`` on that element.

        Raises:
            KeyError: If ``self.attr`` is set and a matched element does
                not carry that attribute.
        """
        # An encoding hint is meaningless for already-decoded text;
        # BeautifulSoup ignores it there and emits a warning, so only
        # forward it for other kinds of input (bytes, file objects).
        options = {}
        if not isinstance(page, type(u'')):
            options['from_encoding'] = from_encoding
        soup = bs4.BeautifulSoup(page, parser, **options)

        matches = soup.select(self.rule)
        if self.attr is None:
            return list(matches)
        return [tag[self.attr] for tag in matches]

if __name__ == '__main__':
    # Manual smoke test: fetch one page with the project's Crawler and run
    # two Item rules against it. Requires network access.
    from crawler import Crawler

    url = 'http://novel.hongxiu.com/a/1222858/12139073.html'
    html = Crawler().openUrl(url)

    # Attribute extraction: the href of the <a> whose id is "htmlxiazhang".
    # The result is computed to exercise that code path but is not printed.
    urlRule = 'a[id="htmlxiazhang"]'
    urlAttr = 'href'
    urlItem = Item(1, urlRule, urlAttr)
    urls = urlItem.getContent(html)

    # Element extraction: the <div id="htmlContent"> element(s) themselves.
    contentRule = 'div#htmlContent'
    contentAttr = None
    contentItem = Item(1, contentRule, contentAttr)
    contents = contentItem.getContent(html)

    # Parenthesized single-argument form prints the same text on Python 2
    # and keeps the module importable on Python 3.
    print(contents)
